import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
II Visualization of distributional data (“displot”)
%%javascript
= function(lines) {
IPython.OutputArea.prototype._should_scroll return false; // disable auto scrolling
}
# Load seaborn's example "penguins" dataset and preview the first rows.
penguins = sns.load_dataset("penguins")
penguins.head()
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
Histogram with continuous data
# Histogram with seaborn's automatically chosen bins.
sns.displot(penguins, x="flipper_length_mm")

# Same data, with the width of every bin set explicitly.
sns.displot(penguins, x="flipper_length_mm",
            binwidth=7.1)

# Same data, with the number of bins set explicitly instead.
sns.displot(penguins, x="flipper_length_mm",
            bins=20)
Bin widths that are too small can break histograms
="flipper_length_mm",
sns.displot(penguins, x=0.3) binwidth
sns.displot(penguins,="flipper_length_mm",
x=30) # binwdith too big, the two hills in the data are not visible binwidth
sns.displot(penguins,="flipper_length_mm",
x=15) bins
Histogram with discrete data (“party size”)
# Load seaborn's example "tips" dataset and preview the first rows.
tips = sns.load_dataset("tips")
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# "size" (party size) takes integer values, so treat the variable as discrete.
sns.displot(tips, x="size",
            discrete=True)
Histogram with discrete data (weekdays)
sns.displot(tips, x="day")
# no need to specify discrete=True because seaborn figures it out on its own
Distribution of data differentiated based on categorical variable
# One colour per species, all drawn on the same axes.
sns.displot(penguins, x="flipper_length_mm",
            hue="species")

# Colour by species and additionally draw one column of subplots per island.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            col='island')
Histogram stacking versus histogram overlap
With stacking:
# multiple="stack": the per-species bars are stacked on top of each other.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            multiple="stack")
Histogram stacking versus histogram overlap versus dodge
With dodging:
# multiple="dodge": the per-species bars are placed side by side.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            multiple="dodge")
Different subplots for different values of a categorical variable
# One column of subplots per value of "sex".
sns.displot(penguins, x="flipper_length_mm",
            col="sex")

# Full facet grid: columns by sex, rows by island, colours by species,
# with the per-species bars dodged inside every subplot.
sns.displot(penguins, x="flipper_length_mm",
            col="sex",
            hue='species',
            row='island',
            multiple="dodge")
Kernel Density Estimation (KDE) plots to smooth histograms
# Kernel density estimate with the default bandwidth.
sns.displot(penguins, x="flipper_length_mm",
            kind="kde")

sns.displot(penguins, x="flipper_length_mm",
            kind="kde",
            bw_method=0.05)  # setting the bandwidth
# overfitting
# curve is jittery and the jitter is from noise, bandwidth is too small

sns.displot(penguins, x="flipper_length_mm",
            kind="kde",
            bw_method=0.3)  # setting the bandwidth

sns.displot(penguins, x="flipper_length_mm",
            kind="kde",
            bw_method=2)  # setting the bandwidth
# underfitting:
# bandwidth too big, curve too smoothed out, not informative

# One density curve per species.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            kind="kde")

# One density curve per species, one column of subplots per island.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            col='island',
            kind="kde")

# Filled density curves.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            kind="kde",
            fill=True)

# Filled density curves stacked on top of each other.
sns.displot(penguins, x="flipper_length_mm",
            hue="species",
            kind="kde",
            fill=True,
            multiple="stack")
2-dimensional distributional plots
Histograms in 2d (also known as heatmap)
# Passing both x and y produces a two-dimensional histogram.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm")

sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            cbar=True)  # adding a colorbar

# Hexagonal bins in the joint plot, with the marginals drawn alongside.
sns.jointplot(penguins, x="bill_length_mm",
              y="bill_depth_mm",
              kind='hex')
KDE plots in 2d
# Two-dimensional kernel density estimate.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            kind="kde")
Controlling the number of isolines and the threshold for the smallest isoline
# levels sets the number of isolines; thresh sets the threshold for the
# smallest isoline that is drawn.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            kind="kde",
            levels=12,
            thresh=0.02)
2d histograms differentiated with colors for different species
# 2-D histogram coloured by species, one column of subplots per island.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            hue="species",
            col='island')
2d KDE plots differentiated with colors for different species
# 2-D KDE coloured by species, one column of subplots per island.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            hue="species",
            col='island',
            kind="kde")
Changing binwidth (in two directions)
# A (x, y) tuple sets a separate bin width for each direction.
sns.displot(penguins, x="bill_length_mm",
            y="bill_depth_mm",
            binwidth=(3, 1))
Visualizing 2d distributions and 1d marginals with sns.jointplot()
# Default joint plot, with an 'X' marker for the points.
sns.jointplot(data=penguins,
              x="bill_length_mm",
              y="bill_depth_mm",
              marker='X'
              )

# Histogram variant of the joint plot.
sns.jointplot(data=penguins,
              x="bill_length_mm",
              y="bill_depth_mm",
              kind='hist'
              )

# KDE variant of the joint plot, coloured by species.
sns.jointplot(data=penguins,
              x="bill_length_mm",
              y="bill_depth_mm",
              hue='species',
              kind='kde'
              )
visualizing 2d distributions and 1d marginals
# Joint KDE plot per species, with the dataset passed by keyword ...
sns.jointplot(data=penguins,
              x="bill_length_mm",
              y="bill_depth_mm",
              hue="species",
              kind="kde"
              )

# ... and the same plot with the dataset passed positionally.
sns.jointplot(penguins, x="bill_length_mm",
              y="bill_depth_mm",
              hue="species",
              kind="kde")
Rug: visualizing 2d dist AND 1d locations of single points
Multiple layers: for instance, both scatter plot and KDE plots, both rugs and marginal plots
# jointplot returns a grid object; keep it so more layers can be drawn on it.
g = sns.jointplot(data=penguins,
                  x="bill_length_mm",
                  y="bill_depth_mm")

# Overlay a red KDE on the joint axes of the plot just created.
g.plot_joint(sns.kdeplot, color="red")

# scatter plot in blue
g = sns.jointplot(data=penguins,
                  x="bill_length_mm",
                  y="bill_depth_mm")

# kde plot in red, same plot
g.plot_joint(sns.kdeplot, color="red")

# rug plot in green
g.plot_marginals(sns.rugplot, color="green", height=0.15)